1 About
This book sets out the analysis of somite development periods, for Ali Ahmed Seleit in the Aulehla Group at EMBL Heidelberg.
(#fig:unnamed-chunk-1)Video by Ali Ahmed Seleit
2 Alignment of F0 and F2 generations
All the code used to process the data using Snakemake1 can be found here:
3 Homozygosity of F0 Cab and Kaga
Snakefile for aligning F0 samples: https://github.com/brettellebi/somites/blob/master/workflow/rules/02_F0_mapping.smk
Snakefile for calling F0 samples: https://github.com/brettellebi/somites/blob/master/workflow/rules/03_F0_calling.smk
3.1 Read in total medaka genome count
# Get chromosome lengths
med_chr_lens = read.table(here::here("data",
"Oryzias_latipes.ASM223467v1.dna.toplevel.fa_chr_counts.txt"),
col.names = c("chr", "end"))
# Add start
med_chr_lens$start = 1
# Reorder
med_chr_lens = med_chr_lens %>%
dplyr::select(chr, start, end) %>%
# remove MT
dplyr::filter(chr != "MT")
# Total HdrR sequence length
total_hdrr_bases = sum(med_chr_lens$end)Make custom chromosome scaffold
##Create custom genome
med_genome = regioneR::toGRanges(med_chr_lens)3.2 Read in data
in_dir = "/nfs/research/birney/users/ian/somites/recombination_blocks"
in_files = list.files(in_dir, pattern = "20210803_hmm_output_F0", full.names = T)
# Read into list
ck_list = purrr::map(in_files, function(FILE){
out = readr::read_tsv(FILE,
col_types = "ciiidii")
})
# Set names as bin length
names(ck_list) = basename(in_files) %>%
stringr::str_split("_", simplify = T) %>%
subset(select = 6) %>%
stringr::str_remove(".txt")
# Reorder
ck_list = ck_list[order(as.numeric(names(ck_list)))]
counter = 0
ck_list = purrr::map(ck_list, function(data){
counter <<- counter + 1
# set bin length
bin_length = as.numeric(names(ck_list)[counter])
# add bin start and end coordinates
df = data %>%
dplyr::mutate(LANE = sample %>%
stringr::str_split("/", simplify = T) %>%
subset(select = 10),
BIN_LENGTH = bin_length,
BIN_START = (bin - 1) * bin_length + 1,
BIN_END = bin * bin_length,
BIN_LENGTH_KB = BIN_LENGTH / 1e3,
READS_PER_BIN = mat + pat)
return(df)
})
# Recode `ck_list$state` so that 0,1,2 corresponds to HOM_REF, HET, HOM_ALT
ck_list = purrr::map(ck_list, function(df){
df = df %>%
dplyr::mutate(state = dplyr::recode(state,
`0` = 2,
`1` = 1,
`2` = 0))
})3.2.1 Get total number of bases covered by each state
# Take 5kb DF
df = ck_list$`5000`
# Set states to loop over
states = 0:2
names(states) = states
# Run loop over each LANE
base_cov_df = df %>%
split(., f = .$LANE) %>%
purrr::map(., function(LANE){
# convert to ranges object
lane_ranges = GenomicRanges::makeGRangesFromDataFrame(LANE,
keep.extra.columns = T,
ignore.strand = T,
seqnames.field = "chr",
start.field = "BIN_START",
end.field = "BIN_END")
# get total bases covered by each state
purrr::map_dfr(states, function(STATE){
lane_ranges[lane_ranges$state == STATE] %>%
# merge contiguous ranges
GenomicRanges::reduce(.) %>%
# get width of ranges
width(.) %>%
# get total bases covered
sum(.) %>%
# coerce into data frame
data.frame("BASES_COVERED" = .)
}, .id = "STATE") %>%
# add FREQ column
dplyr::mutate(FREQ = BASES_COVERED / total_hdrr_bases) %>%
# add UNCLASSIFIED row
tibble::add_row(STATE = "UNCLASSIFIED",
BASES_COVERED = total_hdrr_bases - sum(.$BASES_COVERED),
FREQ = (total_hdrr_bases - sum(.$BASES_COVERED)) / total_hdrr_bases)
}
) %>%
dplyr::bind_rows(.id = "LANE")Plot
# Plot
ck_prop_plot = base_cov_df %>%
dplyr::mutate(STATE = factor(STATE, levels = c(0,1,2, "UNCLASSIFIED")),
#STATE_RECODE = dplyr::recode(STATE,
# `0` = "HOM REF (HdrR)",
# `1` = "HET",
# `2` = "HOM ALT",
# "UNCLASSIFIED" = "Unclassified")
) %>%
# plot
ggplot(aes(STATE, FREQ, colour = STATE, fill = STATE)) +
geom_col() +
facet_grid(cols = vars(LANE)) +
theme_bw() +
scale_colour_manual(values = pal_hom_het_2_lines) +
scale_fill_manual(values = pal_hom_het_2) +
guides(colour = "none", fill = "none") +
xlab("Genotype") +
ylab("Proportion of reference bases covered")
ck_prop_plot
# Interactive version
ggplotly(ck_prop_plot)3.3 Karyoplot
bb_list_ck = purrr::map(ck_list, function(df){
# loop over different bin lengths
block_bounds_list = df %>%
# loop over LANE
split(., f = .$LANE) %>%
purrr::map(., function(LANE){
# loop over CHR
LANE %>%
split(., f = .$chr) %>%
purrr::map(., function(CHR){
# Get lengths of each contiguous state
cont_len = rle(CHR$state)
# Get cumulative sum of those lengths
cum_blocks = cumsum(cont_len$lengths)
# Get rows that correspond to block changes
block_bounds = CHR[cum_blocks, ] %>%
# Add end of previous black
dplyr::mutate(END_PREV = dplyr::lag(BIN_END)) %>%
# Replace the NA in the first row with `1`
dplyr::mutate(END_PREV = tidyr::replace_na(END_PREV, 1)) %>%
# Add colour
dplyr::mutate(COLOUR = dplyr::recode(state,
!!!pal_hom_het_2[-which(names(pal_hom_het_2) == "UNCLASSIFIED")]))
}) %>%
dplyr::bind_rows()
})
})Extract y cutoff points for each y
lc_list_ck = purrr::map(bb_list_ck, function(block_bounds_list){
lane_cutoffs = cut(0:1, breaks = length(block_bounds_list)) %>%
levels(.) %>%
data.frame(lower = as.numeric( sub("\\((.+),.*", "\\1", .) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", .) )) %>%
dplyr::arrange(dplyr::desc(lower))
return(lane_cutoffs)
})Plot Karyoplots
counter_A = 0
purrr::map(bb_list_ck, function(block_bounds_list){
counter_A <<- counter_A + 1
# set file name
file_name = paste("20210803_ck_karyoplot_", names(bb_list_ck)[counter_A], ".png", sep = "")
file_out = here::here("docs/plots", file_name)
# Get lane cutoffs
lane_cutoffs = lc_list_ck[[counter_A]]
png(file=file_out,
width=13000,
height=3000,
units = "px",
res = 300)
# Plot ideogram
kp = karyoploteR::plotKaryotype(med_genome, plot.type = 5)
# Add data background
#karyoploteR::kpDataBackground(kp, r0=0, r1 = 1, color = "white")
# Add rectangles in loop
counter_B = 0
purrr::map(block_bounds_list, function(LANE){
# Add to counter_B
counter_B <<- counter_B + 1
# Add rectangles
karyoploteR::kpRect(kp,
chr = LANE$chr,
x0 = LANE$END_PREV,
x1 = LANE$BIN_END,
y0 = lane_cutoffs[counter_B, ] %>%
dplyr::pull(lower),
y1 = lane_cutoffs[counter_B, ] %>%
dplyr::pull(upper),
col = LANE$COLOUR,
border = NA)
# Add axis label
karyoploteR::kpAddLabels(kp, labels = unique(LANE$LANE),
r0 = lane_cutoffs[counter_B, ] %>%
dplyr::pull(lower),
r1 = lane_cutoffs[counter_B, ] %>%
dplyr::pull(upper),
cex = 0.5)
})
dev.off()
})knitr::include_graphics(here::here("book/plots/20210803_ck_karyoplot_5000.png"))
(#fig:unnamed-chunk-10)Bin length: 5 kb
3.4 20211110 update
Changes:
- Used homozgygous-divergent Cab-Kaga sites instead of all sites in F0 VCF
- Filtered out reads that overlapped repeat regions
3.4.1 Read in data
in_dir = "/nfs/research/birney/users/ian/somites/recombination_blocks/F0/no_repeat_reads"
in_files = list.files(in_dir, full.names = T)
# Read into list
ck_list = purrr::map(in_files, function(FILE){
out = readr::read_tsv(FILE,
col_types = "ciiidii")
})
# Set names as bin length
names(ck_list) = basename(in_files) %>%
stringr::str_remove(".txt")
# Reorder
ck_list = ck_list[order(as.numeric(names(ck_list)))]
counter = 0
ck_list = purrr::map(ck_list, function(data){
counter <<- counter + 1
# set bin length
bin_length = as.numeric(names(ck_list)[counter])
# add bin start and end coordinates
df = data %>%
dplyr::mutate(LANE = sample %>%
basename() %>%
stringr::str_remove(".txt"),
BIN_LENGTH = bin_length,
BIN_START = (bin - 1) * bin_length + 1,
BIN_END = bin * bin_length,
BIN_LENGTH_KB = BIN_LENGTH / 1e3,
READS_PER_BIN = mat + pat)
return(df)
})
# Recode `ck_list$state` so that 0,1,2 corresponds to Cab, Het, Kaga
ck_list = purrr::map(ck_list, function(df){
df = df %>%
dplyr::mutate(state = dplyr::recode(state,
`0` = 2,
`1` = 1,
`2` = 0))
})3.4.2 Get total number of bases covered by each state
# Take 5kb DF
df = ck_list$`5000`
# Set states to loop over
states = 0:2 ; names(states) = states
# Run loop over each LANE
base_cov_df = df %>%
split(., f = .$LANE) %>%
purrr::map(., function(LANE){
# convert to ranges object
lane_ranges = GenomicRanges::makeGRangesFromDataFrame(LANE,
keep.extra.columns = T,
ignore.strand = T,
seqnames.field = "chr",
start.field = "BIN_START",
end.field = "BIN_END")
# get total bases covered by each state
purrr::map_dfr(states, function(STATE){
lane_ranges[lane_ranges$state == STATE] %>%
# merge contiguous ranges
GenomicRanges::reduce(.) %>%
# get width of ranges
width(.) %>%
# get total bases covered
sum(.) %>%
# coerce into data frame
data.frame("BASES_COVERED" = .)
}, .id = "STATE") %>%
# add FREQ column
dplyr::mutate(FREQ = BASES_COVERED / total_hdrr_bases) %>%
# add UNCLASSIFIED row
tibble::add_row(STATE = "UNCLASSIFIED",
BASES_COVERED = total_hdrr_bases - sum(.$BASES_COVERED),
FREQ = (total_hdrr_bases - sum(.$BASES_COVERED)) / total_hdrr_bases)
}
) %>%
dplyr::bind_rows(.id = "LANE")Plot
# Plot
ck_prop_plot = base_cov_df %>%
dplyr::mutate(STATE = factor(STATE, levels = c(0,1,2, "UNCLASSIFIED"))) %>%
# plot
ggplot(aes(STATE, FREQ, colour = STATE, fill = STATE)) +
geom_col() +
facet_grid(cols = vars(LANE)) +
theme_bw(base_size = 9) +
scale_colour_manual(values = pal_hom_het_2_lines) +
scale_fill_manual(values = pal_hom_het_2) +
guides(colour = "none", fill = "none") +
xlab("Genotype") +
ylab("Proportion of reference bases covered")
ck_prop_plot
# Interactive version
ggplotly(ck_prop_plot)3.4.3 Karyoplot
bb_list_ck = purrr::map(ck_list, function(df){
# loop over different bin lengths
block_bounds_list = df %>%
# loop over LANE
split(., f = .$LANE) %>%
purrr::map(., function(LANE){
# loop over CHR
LANE %>%
split(., f = .$chr) %>%
purrr::map(., function(CHR){
# Get lengths of each contiguous state
cont_len = rle(CHR$state)
# Get cumulative sum of those lengths
cum_blocks = cumsum(cont_len$lengths)
# Get rows that correspond to block changes
block_bounds = CHR[cum_blocks, ] %>%
# Add end of previous black
dplyr::mutate(END_PREV = dplyr::lag(BIN_END)) %>%
# Replace the NA in the first row with `1`
dplyr::mutate(END_PREV = tidyr::replace_na(END_PREV, 1)) %>%
# Add colour
dplyr::mutate(COLOUR = dplyr::recode(state,
!!!pal_hom_het_2[-which(names(pal_hom_het_2) == "UNCLASSIFIED")]))
}) %>%
dplyr::bind_rows()
})
})Extract y cutoff points for each y
lc_list_ck = purrr::map(bb_list_ck, function(block_bounds_list){
lane_cutoffs = cut(0:1, breaks = length(block_bounds_list)) %>%
levels(.) %>%
data.frame(lower = as.numeric( sub("\\((.+),.*", "\\1", .) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", .) )) %>%
dplyr::arrange(dplyr::desc(lower))
return(lane_cutoffs)
})Plot Karyoplots
counter_A = 0
purrr::map(bb_list_ck, function(block_bounds_list){
counter_A <<- counter_A + 1
# set file name
file_name = paste("20211110_ck_karyoplot_", names(bb_list_ck)[counter_A], ".png", sep = "")
file_out = here::here("book/plots", file_name)
# Get lane cutoffs
lane_cutoffs = lc_list_ck[[counter_A]]
png(file=file_out,
width=13000,
height=3000,
units = "px",
res = 300)
# Plot ideogram
kp = karyoploteR::plotKaryotype(med_genome, plot.type = 5)
# Add data background
#karyoploteR::kpDataBackground(kp, r0=0, r1 = 1, color = "white")
# Add rectangles in loop
counter_B = 0
purrr::map(block_bounds_list, function(LANE){
# Add to counter_B
counter_B <<- counter_B + 1
# Add rectangles
karyoploteR::kpRect(kp,
chr = LANE$chr,
x0 = LANE$END_PREV,
x1 = LANE$BIN_END,
y0 = lane_cutoffs[counter_B, ] %>%
dplyr::pull(lower),
y1 = lane_cutoffs[counter_B, ] %>%
dplyr::pull(upper),
col = LANE$COLOUR,
border = NA)
# Add axis label
karyoploteR::kpAddLabels(kp, labels = unique(LANE$LANE),
r0 = lane_cutoffs[counter_B, ] %>%
dplyr::pull(lower),
r1 = lane_cutoffs[counter_B, ] %>%
dplyr::pull(upper),
cex = 0.5)
})
dev.off()
})knitr::include_graphics(here::here("book/plots/20211110_ck_karyoplot_5000.png"))
(#fig:unnamed-chunk-17)Bin length: 5 kb
3.4.4 Without filling in empty blocks
counter = 0
bb_list_ck_wunc = purrr::map(ck_list, function(df){
counter <<- counter + 1
BIN_LENGTH = names(ck_list)[counter] %>%
as.numeric()
# loop over different bin lengths
block_bounds_list = df %>%
# loop over LANE
split(., f = .$LANE) %>%
purrr::map(., function(LANE){
STRAIN = unique(LANE$LANE)
# Create list of possible bins
poss_bins = purrr::map(med_chr_lens$chr, function(CHR){
# Get chr end
CHR_END = med_chr_lens %>%
dplyr::filter(chr == CHR) %>%
dplyr::pull(end) %>%
as.numeric()
# Get bin starts
out = tibble::tibble(chr = as.numeric(CHR),
BIN_START = seq(from = 1, to = CHR_END, by = BIN_LENGTH),
BIN_END = BIN_START + BIN_LENGTH - 1
)
# Adjust final bin end
out[nrow(out), "BIN_END"] = CHR_END
return(out)
}) %>%
dplyr::bind_rows()
# Bind DF
new_df = dplyr::left_join(poss_bins,
LANE %>%
dplyr::select(chr, BIN_START, BIN_END, state),
by = c("chr", "BIN_START", "BIN_END")) %>%
# replace NAs with `UNCLASSIFIED`
dplyr::mutate(state = state %>%
tidyr::replace_na("UNCLASSIFIED"),
# add STRAIN
LANE = STRAIN) %>%
# add COLOUR
dplyr::mutate(COLOUR = dplyr::recode(state,
!!!pal_hom_het_2))
})
})Plot Karyoplots
counter_A = 0
purrr::map(bb_list_ck_wunc, function(block_bounds_list){
counter_A <<- counter_A + 1
# set file name
file_name = paste("20211110_ck_karyoplot_wimiss_", names(bb_list_ck_wunc)[counter_A], ".png", sep = "")
file_out = here::here("book/plots", file_name)
# Get lane cutoffs
lane_cutoffs = lc_list_ck[[counter_A]]
png(file=file_out,
width=13000,
height=3000,
units = "px",
res = 300)
# Plot ideogram
kp = karyoploteR::plotKaryotype(med_genome, plot.type = 5)
# Add data background
#karyoploteR::kpDataBackground(kp, r0=0, r1 = 1, color = "white")
# Add rectangles in loop
counter_B = 0
purrr::map(block_bounds_list, function(LANE){
# Add to counter_B
counter_B <<- counter_B + 1
# Add rectangles
karyoploteR::kpRect(kp,
chr = LANE$chr,
x0 = LANE$BIN_START,
x1 = LANE$BIN_END,
y0 = lane_cutoffs[counter_B, ] %>%
dplyr::pull(lower),
y1 = lane_cutoffs[counter_B, ] %>%
dplyr::pull(upper),
col = LANE$COLOUR,
border = NA)
# Add axis label
karyoploteR::kpAddLabels(kp, labels = unique(LANE$LANE),
r0 = lane_cutoffs[counter_B, ] %>%
dplyr::pull(lower),
r1 = lane_cutoffs[counter_B, ] %>%
dplyr::pull(upper),
cex = 3.5)
})
dev.off()
})knitr::include_graphics(here::here("book/plots/20211110_ck_karyoplot_wimiss_5000.png"))
(#fig:unnamed-chunk-20)Bin length: 5 kb
4 F2 recombination blocks (all homozygous-divergent sites)
Snakefile for aligning F2 samples: https://github.com/brettellebi/somites/blob/master/workflow/rules/04_F2_mapping.smk Snakefile for running HMM and generating figures: https://github.com/brettellebi/somites/blob/master/workflow/rules/05_F2_recomb_blocks.smk
library(here)
#> here() starts at /hps/software/users/birney/ian/repos/somites
site_filter = "all_sites"4.1 Base coverage
4.1.1 Total
knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/base_cov_total.png"))
4.1.2 By chromosome
knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/base_cov_by_chrom.png"))
4.2 Proportion of sites
4.2.1 Total
knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/prop_sites_total.png"))
4.2.2 By chromosome
knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/prop_sites_by_chrom.png"))
4.3 Karyoplots
4.3.1 No missing blocks
knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/karyoplot_no_missing.png"))
4.3.2 With missing blocks
knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/karyoplot_with_missing.png"))
5 F2 recombination blocks (filtered sites)
Exclusions: * reads overlapping HdrR repeat regions * regions of persistent heterozygosity in the MIKK panel * filtered based on read count and proportion of Cab)
Snakefile for aligning F2 samples: https://github.com/brettellebi/somites/blob/master/workflow/rules/04_F2_mapping.smk
Snakefile for running HMM and generating figures: https://github.com/brettellebi/somites/blob/master/workflow/rules/05_F2_recomb_blocks.smk
library(here)
#> here() starts at /hps/software/users/birney/ian/repos/somites
site_filter = "no_repeat_reads_or_pers_hets_filtered_for_read_count_and_cab_prop"5.1 Base coverage
5.1.1 Total
knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/base_cov_total.png"))
5.1.2 By chromosome
knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/base_cov_by_chrom.png"))
5.2 Proportion of sites
5.2.1 Total
knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/prop_sites_total.png"))
5.2.2 By chromosome
knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/prop_sites_by_chrom.png"))
5.3 Karyoplots
5.3.1 No missing blocks
knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/karyoplot_no_missing.png"))
5.3.2 With missing blocks
knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/karyoplot_with_missing.png"))
6 Association testing with simulated phenotypes
Code for generating simulated phenotypes is set out below.
The simulated phenotypes are then used in this Snakefile for testing GWLS code, specifically rule test_gwls: https://github.com/brettellebi/somites/blob/master/workflow/rules/07_assocation_testing.smk
library(here)
source(here::here("book/source/04-Association_testing.R"))6.1 Read in F2 recombination blocks
6.1.1 Read data
in_dir = "/nfs/research/birney/users/ian/somites/recombination_blocks/20211027"
in_files = list.files(in_dir, pattern = "F2_", full.names = T)
# Read into list
data_list = purrr::map(in_files, function(FILE){
out = readr::read_tsv(FILE,
col_types = "ciiidii")
})
# Set names as bin length
names(data_list) = basename(in_files) %>%
stringr::str_split("_", simplify = T) %>%
subset(select = 2) %>%
stringr::str_remove(".txt")
# Reorder
data_list = data_list[order(as.numeric(names(data_list)))]
counter = 0
df_list = purrr::map(data_list, function(data){
counter <<- counter + 1
# set bin length
bin_length = as.numeric(names(data_list)[counter])
# add bin start and end coordinates
df = data %>%
dplyr::mutate(SAMPLE = basename(sample) %>%
stringr::str_remove(".txt") %>%
as.numeric(.),
BIN_START = (bin - 1) * bin_length + 1,
BIN_END = bin * bin_length) %>%
# recode state to make 0 == "Cab"
dplyr::mutate(STATE = dplyr::recode(state,
`0` = 2,
`1` = 1,
`2` = 0)) %>%
dplyr::select(SAMPLE, CHROM = chr, BIN = bin, BIN_START, BIN_END, STATE)
return(df)
})6.1.2 How many possible blocks have at least one call?
6.1.2.1 Count total existing bins
bin_lengths = as.integer(names(df_list))
names(bin_lengths) = bin_lengths
n_bins = purrr::map(bin_lengths, function(BIN_LENGTH){
purrr::map_int(med_chr_lens$end, function(CHR_END){
out = seq(from = 1, to = CHR_END, by = BIN_LENGTH)
return(length(out))
})
})
n_bins_total = purrr::map_int(n_bins, sum)6.1.2.2 Proportion of total bins with calls
# Get total number of samples
n_samples = df_list$`5000`$SAMPLE %>%
unique(.) %>%
length(.)
# Build DF of bins
n_bins_df = purrr::map_dfr(1:length(df_list), function(COUNTER){
# Bin length
bin_length = as.numeric(names(df_list)[COUNTER])
# Number of total bins
n_bins = n_bins_total[COUNTER]
# Number of bins with calls
n_bins_with_calls = df_list[[COUNTER]] %>%
dplyr::distinct(CHROM, BIN_START) %>%
nrow(.)
# Number of bins with calls for each
n_bins_no_missing = df_list[[COUNTER]] %>%
dplyr::count(CHROM, BIN_START) %>%
dplyr::filter(n == n_samples) %>%
nrow(.)
# Build final data frame
out = tibble::tibble(BIN_LENGTH = bin_length,
N_BINS = n_bins,
N_BINS_WITH_CALLS = n_bins_with_calls,
N_BINS_NO_MISSING = n_bins_no_missing) %>%
dplyr::mutate(PROP_BINS_WITH_CALLS = N_BINS_WITH_CALLS / N_BINS,
PROP_BINS_NO_MISSING = N_BINS_NO_MISSING / N_BINS )
return(out)
})
DT::datatable(n_bins_df)6.1.3 Merge recombination blocks
6.1.3.1 List with final recoded genotypes (including NAs)
gt_list = purrr::map(df_list, function(BIN_LENGTH_DF){
# Widen data frame
gt_final = BIN_LENGTH_DF %>%
tidyr::pivot_wider(names_from = SAMPLE, values_from = STATE)
# Pull out matrix of genotypes
gt_mat = as.matrix(gt_final[, 5:ncol(gt_final)])
# Get indexes of loci with > 1 genotype
bins_to_keep = logical()
for (ROW in 1:nrow(gt_mat)){
# get unique values in each row
out = unique(gt_mat[ROW, ])
# remove NAs
out = out[!is.na(out)]
# if more than one value, return TRUE
if (length(out) > 1) {
bins_to_keep[ROW] = TRUE
}
# if just one value (i.e. if all samples are the same genotype at that locus), return false
else {
bins_to_keep[ROW] = FALSE
}
}
# filter gt_final
gt_filt = gt_final %>%
dplyr::filter(bins_to_keep) %>%
# recode genotypes to -1, 0, 1
dplyr::mutate(dplyr::across(-c("CHROM", "BIN", "BIN_START", "BIN_END"),
~dplyr::recode(.x,
`0` = -1,
`1` = 0,
`2` = 1))) %>%
# order
dplyr::arrange(CHROM, BIN_START)
return(gt_filt)
})
# Show first 10 columns
gt_list$`20000`[, 1:10] %>%
head(.) %>%
DT::datatable(.) 6.1.3.2 List with final recoded genotypes (complete cases only)
gt_nomiss_list = purrr::map(gt_list, function(BIN_LENGTH_DF){
BIN_LENGTH_DF %>%
dplyr::filter(complete.cases(.))
})6.2 Simulate phenotype
6.2.1 Extract samples of genotypes
# Set directory
out_dir_test = file.path(lts_dir, "association_testing/20211027_test")These must be written to a file because PhenotypeSimulator reads delimited genotypes from files.
# Get random 10 loci
set.seed(10)
n_loci = 10
# NOTE: PhenotypeSimulator::readStandardGenotypes states that the genotype file must
# delim: a [delimter]-delimited file of [(NrSNPs+1) x (NrSamples+1)] genotypes with the snpIDs in the first column and the sampleIDs in the first row and genotypes encoded as numbers between 0 and 2 representing the (posterior) mean genotype, or dosage of the minor allele.
sample_gts = purrr::map(seq_along(gt_nomiss_list), function(COUNTER){
# Pull out random SNPs
snp_sample = gt_nomiss_list[[COUNTER]] %>%
dplyr::slice_sample(n = n_loci) %>%
dplyr::arrange(CHROM, BIN_START) %>%
# create locus column
dplyr::mutate(LOCUS = paste(CHROM, BIN_START, sep = ":")) %>%
# remove superfluous columns
dplyr::select(-c(CHROM, BIN, BIN_START, BIN_END)) %>%
# recode back to 0,1,2
dplyr::mutate(dplyr::across(-LOCUS,
~dplyr::recode(.x,
`-1` = 0,
`0` = 1,
`1` = 2))) %>%
# reorder columns
dplyr::select(LOCUS, everything())
})
names(sample_gts) = names(gt_nomiss_list)
purrr::map(seq_along(sample_gts), function(COUNTER){
# save to file
bin_length = names(sample_gts)[COUNTER]
out_file = file.path(out_dir_test, "target_loci", paste(bin_length, ".csv", sep = ""))
readr::write_csv(sample_gts[[COUNTER]], out_file)
})
saveRDS(sample_gts, file.path(out_dir_test, "target_loci/sample_gts.rds"))6.2.2 Simulate phenotype
sim_path = file.path(out_dir_test, "simulated_phenotypes/20211103_sim_phenos.rds")set.seed(5671)
# N samples
N = n_samples
# N phenotypes
P = 1
# Proportion of total genetic variance
genVar = 0.5
# Proportion of genetic variance of genetic variant effects
h2s = 1
# Proportion of total noise variance
noiseVar = 0.5
# Proportion of noise variance of observational noise effects
phi = 1
sim_phenos = purrr::map(seq_along(sample_gts), function(COUNTER){
# get sample file path
bin_length = names(sample_gts)[COUNTER]
gt_sample_file = file.path(out_dir_test, "target_loci", paste(bin_length, ".csv", sep = ""))
sim_pheno = PhenotypeSimulator::runSimulation(N = N, P = P,
genVar = genVar, h2s = h2s,
noiseVar = noiseVar, phi = phi,
cNrSNP = n_loci,
genotypefile = gt_sample_file,
format = "delim",
genoDelimiter = ",")
return(sim_pheno)
})
names(sim_phenos) = names(sample_gts)
saveRDS(sim_phenos, sim_path)
# Write as .xlsx to use in same Snakemake code as true GWLS
lapply(seq_along(sim_phenos), function(COUNTER){
out = tibble::tibble(fish = colnames(sample_gts[[COUNTER]])[-1],
Y = sim_phenos[[COUNTER]]$phenoComponentsFinal$Y)
# set path for output
out_file = file.path(lts_dir,
"association_testing/20211027_test/simulated_phenotypes",
paste(names(sim_phenos)[COUNTER], ".xlsx", sep = ""))
# write to file
openxlsx::write.xlsx(out, out_file, overwrite = T)
})6.3 Reformat genotypes for GridLMM
6.3.1 Complete cases
final_nomiss = purrr::map(seq_along(gt_nomiss_list), function(COUNTER){
out = list()
# Genotypes
out[["genotypes"]] = gt_nomiss_list[[COUNTER]] %>%
dplyr::select(-c(CHROM, BIN, BIN_START, BIN_END)) %>%
# convert to matrix
as.matrix(.) %>%
# transpose to put samples as rows
t(.) %>%
# convert to data frame
as.data.frame(.)
# Positions
out[["positions"]] = gt_nomiss_list[[COUNTER]] %>%
dplyr::select(CHROM, BIN_START, BIN_END)
# Phenotypes
out[["phenotypes"]] = data.frame(fish = rownames(sim_phenos[[COUNTER]]$phenoComponentsFinal$Y),
Y = sim_phenos[[COUNTER]]$phenoComponentsFinal$Y %>%
as.numeric()
)
return(out)
})
names(final_nomiss) = names(gt_nomiss_list)6.3.2 With NAs
final_wimiss = purrr::map(seq_along(gt_list), function(COUNTER){
out = list()
# Genotypes
out[["genotypes"]] = gt_list[[COUNTER]] %>%
dplyr::select(-c(CHROM, BIN, BIN_START, BIN_END)) %>%
# convert to matrix
as.matrix(.) %>%
# transpose to put samples as rows
t(.) %>%
# convert to data frame
as.data.frame(.)
# Positions
out[["positions"]] = gt_list[[COUNTER]] %>%
dplyr::select(CHROM, BIN_START, BIN_END)
# Phenotypes
out[["phenotypes"]] = data.frame(SAMPLE = rownames(sim_phenos[[COUNTER]]$phenoComponentsFinal$Y),
Y = sim_phenos[[COUNTER]]$phenoComponentsFinal$Y %>%
as.numeric()
)
return(out)
})
names(final_wimiss) = names(gt_list)6.4 Test GridLMM
6.4.1 No missing
6.4.1.1 Run GWAS
test_out_nomiss = file.path(out_dir_test, "gwls_results", "gwas_results_nomiss.rds")gwas_tests_nomiss = purrr::map(final_nomiss, function(BIN_LENGTH){
run_gwas(d = BIN_LENGTH$genotypes,
m = BIN_LENGTH$positions,
p = BIN_LENGTH$phenotypes)
})
saveRDS(gwas_tests_nomiss, test_out_nomiss)6.4.1.2 Plot
# Create custom Manhattan plot
test_man_nomiss = lapply(seq_along(gwas_tests_nomiss), function(COUNTER){
# Get bin length
BIN_LENGTH = names(gwas_tests_nomiss)[COUNTER] %>%
as.numeric(.)
# Clean data frame
test_results = gwas_tests_nomiss[[COUNTER]]$results %>%
dplyr::left_join(med_chr_lens, by = c("Chr" = "chr")) %>%
# add x-coord
dplyr::mutate(X_COORD = pos + TOT) %>%
# change column names
dplyr::rename(CHROM = Chr, BIN_START = pos) %>%
# add BIN_END
dplyr::mutate(BIN_END = BIN_START + BIN_LENGTH - 1) %>%
# add locus
dplyr::mutate(LOCUS = paste(CHROM, BIN_START, sep = ":")) %>%
# target or not
dplyr::mutate(TARGET = dplyr::if_else(LOCUS %in% sample_gts[[COUNTER]]$LOCUS,
"yes",
"no"),
TARGET = factor(TARGET, levels = c("yes", "no"))) %>%
# create vector of colours
dplyr::mutate(COLOUR = dplyr::case_when(TARGET == "yes" ~ names(gwas_pal)[1],
gtools::even(CHROM) ~ names(gwas_pal)[2],
gtools::odd(CHROM) ~ names(gwas_pal)[3]),
# order so that `target` is plotted last, at the front
COLOUR = factor(COLOUR, levels = rev(names(gwas_pal))),
SHAPE = dplyr::if_else(TARGET == "yes",
18,
20),
SIZE = dplyr::if_else(TARGET == "yes",
1,
0.5),
ALPHA = dplyr::if_else(TARGET == "yes",
1,
0.5)
)
# Plot
p1 = test_results %>%
ggplot(aes(x = X_COORD,
y = -log10(p_value_REML),
colour = COLOUR,
shape = SHAPE,
size = SIZE,
alpha = ALPHA,
label = CHROM,
label2 = BIN_START,
label3 = BIN_END)) +
geom_point() +
aes(group = rev(TARGET)) +
scale_color_manual(values = gwas_pal) +
scale_shape_identity() +
scale_size_identity() +
scale_alpha_identity() +
scale_x_continuous(breaks = med_chr_lens$MID_TOT,
labels = med_chr_lens$chr) +
theme_bw() +
theme(panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
) +
guides(colour = "none") +
ggtitle(paste("Bin length:", BIN_LENGTH)) +
xlab("Chromosome") +
ylab("-log10(p-value)") +
geom_hline(yintercept = significance_line, colour = "#AAF683", linetype = "dashed") +
geom_hline(yintercept = suggestive_line, colour = "#60D394", linetype = "dashed")
out = ggplotly(p1, tooltip = c("CHROM", "BIN_START", "BIN_END"))
return(out)
})6.4.2 Include loci with missing genotypes
6.4.2.1 Run GWAS
test_out_wimiss = file.path(out_dir_test, "gwls_results", "gwas_results_wimiss.rds")gwas_tests_wimiss = purrr::map(final_wimiss, function(BIN_LENGTH){
run_gwas(d = BIN_LENGTH$genotypes,
m = BIN_LENGTH$positions,
p = BIN_LENGTH$phenotypes)
})
saveRDS(gwas_tests_wimiss, test_out_wimiss)6.4.2.2 Read in results from Snakemake script
gwas_tests_wimiss = purrr::map(bin_lengths, function(BIN_LENGTH){
readRDS(file.path(lts_dir, "association_testing/20211104_test/test_results", paste(BIN_LENGTH, ".rds", sep = "")))
})6.4.2.3 Plot
# Create custom Manhattan plot
gwas_pal = c("#2B2D42", "#F7B267", "#F25C54")
names(gwas_pal) = c("target", "even chr", "odd chr")
significance_line = 3.6
suggestive_line = 2.9
test_man_wimiss = purrr::map(seq_along(gwas_tests_wimiss), function(COUNTER){
# Get bin length
BIN_LENGTH = names(gwas_tests_wimiss)[COUNTER] %>%
as.numeric(.)
# Clean data frame
test_results = gwas_tests_wimiss[[COUNTER]]$results %>%
dplyr::left_join(med_chr_lens, by = c("Chr" = "chr")) %>%
# add x-coord
dplyr::mutate(X_COORD = pos + TOT) %>%
# change column names
dplyr::rename(CHROM = Chr, BIN_START = pos) %>%
# add BIN_END
dplyr::mutate(BIN_END = BIN_START + BIN_LENGTH - 1) %>%
# add locus
dplyr::mutate(LOCUS = paste(CHROM, BIN_START, sep = ":")) %>%
# target or not
dplyr::mutate(TARGET = dplyr::if_else(LOCUS %in% sample_gts[[COUNTER]]$LOCUS,
"yes",
"no"),
TARGET = factor(TARGET, levels = c("yes", "no"))) %>%
# create vector of colours
dplyr::mutate(COLOUR = dplyr::case_when(TARGET == "yes" ~ names(gwas_pal)[1],
gtools::even(CHROM) ~ names(gwas_pal)[2],
gtools::odd(CHROM) ~ names(gwas_pal)[3]),
# order so that `target` is plotted last, at the front
COLOUR = factor(COLOUR, levels = rev(names(gwas_pal))),
SHAPE = dplyr::if_else(TARGET == "yes",
18,
20),
SIZE = dplyr::if_else(TARGET == "yes",
1,
0.5),
ALPHA = dplyr::if_else(TARGET == "yes",
1,
0.5)
)
# Plot
p1 = test_results %>%
ggplot(aes(x = X_COORD,
y = -log10(p_value_REML),
colour = COLOUR,
shape = SHAPE,
size = SIZE,
alpha = ALPHA,
label = CHROM,
label2 = BIN_START,
label3 = BIN_END)) +
geom_point() +
aes(group = rev(TARGET)) +
scale_color_manual(values = gwas_pal) +
scale_shape_identity() +
scale_size_identity() +
scale_alpha_identity() +
scale_x_continuous(breaks = med_chr_lens$MID_TOT,
labels = med_chr_lens$chr) +
theme_bw() +
theme(panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
) +
guides(colour = "none") +
ggtitle(paste("Bin length:", BIN_LENGTH)) +
xlab("Chromosome") +
ylab("-log10(p-value)") +
geom_hline(yintercept = significance_line, colour = "#AAF683", linetype = "dashed") +
geom_hline(yintercept = suggestive_line, colour = "#60D394", linetype = "dashed")
out = ggplotly(p1, tooltip = c("CHROM", "BIN_START", "BIN_END"))
return(out)
})
test_man_wimiss[[1]]